# Preprocessing Data
setwd("/Users/youyang/Downloads")
# Importing the data
library(readxl)
edu_wide <- read_excel("Education.xls", skip=5)
edu_wide <- edu_wide[,-grep("x_", names(edu_wide))]
edu_wide <- as.data.frame(edu_wide)
# Convert from wide to long
library(reshape)
edu_long <- stats::reshape(edu_wide,
varying = names(edu_wide)[4:23],
direction = "long",
idvar = c("fips_code"),
timevar = "year",
sep = "_")
# Identify states and counties
edu_long$area_type <- ifelse(as.numeric(substr(edu_long$fips_code,3,5))!=0,"county","state")
edu_long$area_type <- ifelse(as.numeric(substr(edu_long$fips_code,1,5))==0,"country",edu_long$area_type)
##### Setup
library(plotly)
library(ggthemes)
library(dplyr)
library(ggplot2)
d_country <- edu_long[edu_long$area_type=="country", ]
p_country <- plot_ly(d_country, x = ~year, y = ~lessHS, type = "bar",
name = "<High school", color = I("palegreen")) %>%
add_trace(y = ~HS, name = "High school",
color = I("orange")) %>%
add_trace(y = ~somecoll, name = "Some College",
color = I("lightblue2")) %>%
add_trace(y = ~coll, name = ">=College",
color = I("lightpink")) %>%
layout(title = "1970 - 2015 US Education Level",
yaxis = list(title = 'Percentage'),
xaxis = list(title = 'Year'),
barmode = 'stack',
legend = list("Less than a high school diploma",
"High school diploma only",
"Some college (1-3 years)",
"Four years of college or higher"))
p_country
# Seperate by Region
d_state <- edu_long[edu_long$area_type=="state", ]
west <- c("WA", "OR", "ID", "MT", "WY", "CO", "UT", "NV", "CA", "AZ", "NM")
midwest <- c("ND", "SD", "NE", "KS", "MO", "IA", "MN", "WI", "IL", "MI", "IN", "OH")
south <- c("OK", "TX", "LA", "AR", "MS", "KY", "TN", "AL", "GA", "SC", "FL", "NC", "VA", "DC", "MD", "DE", "WV")
northeast <- c("NY", "ME", "VT", "NH", "CT", "NJ", "RI", "PA", "MA")
other <- c("AK", "HI")
d_state$region <- ifelse(d_state$state %in% west, "west",
ifelse(d_state$state %in% midwest, "midwest",
ifelse(d_state$state %in% south, "south",
ifelse(d_state$state %in% northeast, "northeast",
"others"))))
p_state_region <- ggplot(d_state, aes(x = year, color = state, y = coll,
label = somecoll, label2 = HS,label3 = lessHS)) +
geom_point(alpha = 0.5, size = 1) +
geom_smooth(lwd=0.2) +
facet_grid(region~.) +
theme_minimal() +
xlab("Year") +
ylab("Percent College and Above") +
ggtitle("1970-2015 US Education Level by Region") +
theme(plot.title = element_text(hjust = 0.5, size = 15))
ggplotly(p_state_region)
# Data Table
d_county <- edu_long[edu_long$year == 2015 & edu_long$area_type == "county", ]
d_county$pct_lessHS <- ecdf(d_county$lessHS)(d_county$lessHS)
d_county$pct_HS <- ecdf(d_county$HS)(d_county$HS)
d_county$pct_somecoll <- ecdf(d_county$somecoll)(d_county$somecoll)
d_county$pct_coll <- ecdf(d_county$coll)(d_county$coll)
library(scales)
d_county$overall <- percent(rowMeans(d_county[ ,10:13]))
county <- select(d_county, state, county = area_name, "Education Percentile" = overall)
rownames(county) <- NULL
library(DT)
datatable(county)
# Leaflet
d_county$overall <- rowMeans(d_county[ ,10:13])
library(leaflet)
## Get U.S. county shape files from U.S. census department
library(tigris)
counties <- counties(state = "New York")
combined <- counties@data %>%
left_join(d_county, by = c(NAMELSAD = "area_name"))
counties@data <- combined
# Color by quantile
m = leaflet(counties) %>%
setView(lng = -75, lat = 42.5, zoom = 6) %>%
# Base Groups = Background layer
addTiles(group = "OpenStreetMap") %>%
addProviderTiles(providers$Stamen.Toner, group = "Toner") %>%
addProviderTiles(providers$Stamen.TonerLite, group = "Toner Lite") %>%
# Data Layers
# First Data Layer: Overall Education Level
addPolygons(group = "Overall Education Level",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorQuantile("Blues", overall)(overall),
popup = paste("State:", counties$state ,"<br/>",
"County:",counties$NAME,"<br/>",
"Overall Education Percentile:", percent(counties$overall), "<br/>",
"College and Beyond:", counties$coll, "%", "<br/>",
"Some College:", counties$somecoll, "%", "<br/>",
"High School:", counties$HS, "%", "<br/>",
"< High School:", counties$lessHS, "%")) %>%
# Second Data Layer: College and Beyond
addPolygons(group = "College and Beyond",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorQuantile("Purples", HS)(HS),
popup = paste("State:", counties$state ,"<br/>",
"County:",counties$NAME,"<br/>",
"Overall Education Percentile:", percent(counties$overall), "<br/>",
"College and Beyond:", counties$coll, "%", "<br/>",
"Some College:", counties$somecoll, "%", "<br/>",
"High School:", counties$HS, "%", "<br/>",
"< High School:", counties$lessHS, "%")) %>%
# Third Data Layer: Some College
addPolygons(group = "Some College",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorQuantile("Greens", somecoll)(somecoll),
popup = paste("State:", counties$state ,"<br/>",
"County:",counties$NAME,"<br/>",
"Overall Education Percentile:", percent(counties$overall), "<br/>",
"College and Beyond:", counties$coll, "%", "<br/>",
"Some College:", counties$somecoll, "%", "<br/>",
"High School:", counties$HS, "%", "<br/>",
"< High School:", counties$lessHS, "%")) %>%
# Fourth Data Layer: High School
addPolygons(group = "High School",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorQuantile("Reds", HS)(HS),
popup = paste("State:", counties$state ,"<br/>",
"County:",counties$NAME,"<br/>",
"Overall Education Percentile:", percent(counties$overall), "<br/>",
"College and Beyond:", counties$coll, "%", "<br/>",
"Some College:", counties$somecoll, "%", "<br/>",
"High School:", counties$HS, "%", "<br/>",
"< High School:", counties$lessHS, "%")) %>%
# Fourth Data Layer: Below High School
addPolygons(group = "Below High School",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorQuantile("Greys", lessHS)(lessHS),
popup = paste("State:", counties$state ,"<br/>",
"County:",counties$NAME,"<br/>",
"Overall Education Percentile:", percent(counties$overall), "<br/>",
"College and Beyond:", counties$coll, "%", "<br/>",
"Some College:", counties$somecoll, "%", "<br/>",
"High School:", counties$HS, "%", "<br/>",
"< High School:", counties$lessHS, "%")) %>%
# Layers Control
addLayersControl(
baseGroups = c("OpenStreetMap", "Toner", "Toner Lite"),
overlayGroups = c("Overall Education Level", "College and Beyond",
"Some College", "High School", "Below High School"),
options = layersControlOptions(collapsed = TRUE))
m